In [2]:
#import warnings
#warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import re
import plotly
import plotly.offline as py
from plotly.graph_objs import *
from ipywidgets import interact
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns
In [67]:
# Load Riverkeeper Hudson River water-quality samples (2013 snapshot) from GitHub.
data = pd.read_csv("https://raw.githubusercontent.com/jasonjgy2000/Cuny/master/Data%20608/Homework%204/Data/riverkeeper_data_2013.csv")
In [68]:
# Preview the first five rows of the raw data.
data.head(5)
Out[68]:
In [5]:
# Inspect the raw column dtypes before conversion.
data.dtypes
Out[5]:
The Date column will be converted from a string data type to a date. The less-than and greater-than signs will be removed from EnteroCount so it can be converted to an int data type, and Site will be changed from string to category.
In [32]:
# Convert column types: Date -> datetime, Site -> category, EnteroCount -> int64.
data['Date'] = pd.to_datetime(data["Date"])
data['Site'] = data['Site'].astype('category')
# EnteroCount values may carry '<' or '>' detection-limit qualifiers; strip them
# before casting. regex=True is explicit because pandas >= 2.0 defaults
# str.replace to literal matching, which would silently leave the pattern
# r'\<|>' unmatched and break the int cast.
data['EnteroCount'] = data['EnteroCount'].str.replace(r'[<>]', '', regex=True).astype('int64')
# Drop rows with missing values so downstream aggregations are clean.
data = data.dropna()
In [7]:
# Spot-check a few rows (positions 3230-3233) after cleaning.
data[3230:3234]
Out[7]:
In [33]:
# Confirm the converted dtypes (datetime, category, int64).
data.dtypes
Out[33]:
In [34]:
# water is unacceptable if a single sample is greater than 110 Enterococcus/100 mL
# water is unacceptable if five(5) or more samples weighted average is more than 30 Enterococcus/100 mL
data['Year'] = data['Date'].dt.year
# Aggregate per (Site, Year) in a single pass: total EnteroCount and sample count.
placesToSwim = (data.groupby(['Site', 'Year'])['EnteroCount']
                    .agg(['sum', 'count'])
                    .reset_index()
                    .rename(columns={'sum': 'EnteroCount', 'count': 'RecordCount'}))
# BUG FIX: the original assigned data.groupby(...).transform('count') — a Series
# indexed like `data` (one row per sample, with index gaps after dropna) — into
# placesToSwim (one row per group, fresh 0..n-1 index). Index alignment silently
# picked arbitrary per-sample values instead of per-group counts.
placesToSwim['yearSum'] = placesToSwim.groupby(['Year'])['RecordCount'].transform('sum')
placesToSwim['WeightedMean'] = (placesToSwim['EnteroCount'] * placesToSwim['RecordCount']) / placesToSwim['yearSum']
In [35]:
# add unaccecptable/ acceptable check
# Classify each (Site, Year): with fewer than 5 samples the single-sample
# threshold of 110 applies; with 5 or more samples the weighted-average
# threshold of 30 applies.
few_samples = placesToSwim.RecordCount < 5
over_single = placesToSwim.WeightedMean > 110
over_weighted = placesToSwim.WeightedMean > 30
is_unacceptable = (few_samples & over_single) | (~few_samples & over_weighted)
placesToSwim['Classification'] = np.where(is_unacceptable, 'unacceptable', 'acceptable')
In [36]:
# Preview the per-(Site, Year) summary with its classification.
placesToSwim.head()
Out[36]:
In [ ]:
# Worst Places to Swim
In [63]:
def plot_it(Year, Filter, Sorting):
    """Bar chart of weighted EnteroCount means per site for one year.

    Year: year as a string (widget supplies strings); Filter: 'acceptable'
    or 'unacceptable'; Sorting: 'ascending' or 'descending'.
    """
    # Restrict to the requested year and classification.
    subset = placesToSwim[(placesToSwim['Year'] == int(Year)) &
                          (placesToSwim['Classification'] == Filter)]
    # Keep ten rows: the first ten for 'acceptable', the last ten otherwise.
    subset = subset.head(10) if Filter == 'acceptable' else subset.tail(10)
    # Order the bars in the requested direction.
    subset = subset.sort_values(['WeightedMean'],
                                ascending=[Sorting == 'ascending'])
    fig = {
        'data': [{
            'x': subset['Site'],
            'y': subset['WeightedMean'],
            'name': Year,
            'type': 'bar'
        }],
        'layout': Layout(xaxis=XAxis(title='Site'),
                         yaxis=YAxis(title='Weighted Mean'))
    }
    py.iplot(fig)
In [64]:
# Interactive widget: pick year, classification filter, and sort direction.
interact(plot_it, Year=('2006', '2007','2008', '2009', '2010', '2011', '2012', '2013'),
Filter =('acceptable','unacceptable'),Sorting=('ascending','descending'))
Out[64]:
In [39]:
# For each year, list the ten best (lowest weighted EnteroCount) acceptable sites.
years = [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013]
for year in years:
    best = placesToSwim[(placesToSwim['Year'] == year) &
                        (placesToSwim['Classification'] == 'acceptable')]
    best = (best.sort_values(['WeightedMean'], ascending=True)
                .head(10)[['Site', 'WeightedMean']]
                .reset_index(drop=True))
    best.columns = ['Site', 'Weighted EnteroCount']
    # Single-argument print() calls replace the Python 2 print statements,
    # which are a SyntaxError on Python 3; this form runs on both.
    print('---------------------------------------------')
    print('Best Places to swim for the Year ' + str(year))
    print(best)
In [40]:
# For each year, list the ten worst (highest weighted EnteroCount) unacceptable sites.
for year in years:
    worst = placesToSwim[(placesToSwim['Year'] == year) &
                         (placesToSwim['Classification'] == 'unacceptable')]
    worst = (worst.sort_values(['WeightedMean'], ascending=False)
                  .head(10)[['Site', 'WeightedMean']]
                  .reset_index(drop=True))
    worst.columns = ['Site', 'Weighted EnteroCount']
    # Single-argument print() calls replace the Python 2 print statements,
    # which are a SyntaxError on Python 3; this form runs on both.
    print('---------------------------------------------')
    print('Worst Places to swim for the Year ' + str(year))
    print(worst)
In [91]:
# The ten most frequently tested sites, by number of EnteroCount samples.
testRegularly = (data.groupby(['Site'])['EnteroCount']
                     .count()
                     .to_frame()
                     .reset_index())
testRegularly.columns = ['Site', 'Number of Tests']
testRegularly = (testRegularly.sort_values(['Number of Tests'], ascending=False)
                              .head(10)
                              .reset_index(drop=True))
testRegularly
Out[91]:
In [41]:
# For each year, show the ten sites with the largest mean gap (in days) between tests.
for year in years:
    testingGap = data[data['Year'] == year]
    # Mean gap between consecutive test dates per site.
    testingGap = (testingGap.sort_values('Date')
                            .groupby(['Site'])['Date']
                            .apply(lambda x: x.diff().mean())
                            .to_frame()
                            .reset_index())
    # Sites with only one test in the year yield NaT mean gaps; drop them.
    testingGap = testingGap[testingGap.Date.notnull()]
    testingGap['Date'] = testingGap['Date'].apply(lambda x: x.days)
    testingGap.columns = ['Site', 'Gap(Days)']
    # Single-argument print() calls replace the Python 2 print statements,
    # which are a SyntaxError on Python 3; this form runs on both.
    print('---------------------------------------------------')
    print('Year ' + str(year))
    print(testingGap.sort_values('Gap(Days)', ascending=False).head(10).reset_index(drop=True))
In [60]:
def plot_gaps(Year, Sorting):
    """Bar chart of the ten largest or smallest mean testing gaps (days) per site.

    Year: year as a string (widget supplies strings); Sorting: 'ascending'
    keeps the ten smallest gaps, 'descending' the ten largest.
    """
    year_data = data[data['Year'] == int(Year)]
    # Mean gap between consecutive test dates per site.
    gaps = (year_data.sort_values('Date')
                     .groupby(['Site'])['Date']
                     .apply(lambda s: s.diff().mean())
                     .to_frame()
                     .reset_index())
    # Sites with only one test produce NaT gaps; drop them.
    gaps = gaps[gaps.Date.notnull()]
    gaps['Date'] = gaps['Date'].apply(lambda d: d.days)
    gaps.columns = ['Site', 'Gap(Days)']
    ascending = (Sorting == 'ascending')
    gaps = gaps.sort_values('Gap(Days)', ascending=ascending).head(10)
    fig = {
        'data': [{
            'x': gaps['Site'],
            'y': gaps['Gap(Days)'],
            'name': Year,
            'type': 'bar'
        }],
        'layout': Layout(xaxis=XAxis(title='Site'),
                         yaxis=YAxis(title='Gap(Days)'))
    }
    py.iplot(fig)
In [61]:
# Interactive widget: pick year and sort direction for the testing-gap chart.
interact(plot_gaps, Year=('2006', '2007','2008', '2009', '2010', '2011', '2012', '2013'),Sorting=('ascending','descending'))
Out[61]:
In [45]:
# Narrow to the two columns whose relationship we examine next.
relationship = data[['EnteroCount','FourDayRainTotal']]
relationship.head()
Out[45]:
In [65]:
# Scatter plot of EnteroCount against the four-day rain total.
# Both axes share the same title font; build it once and copy per axis.
axis_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)
trace = Scatter(
    x=relationship['FourDayRainTotal'],
    y=relationship['EnteroCount'],
    mode='markers',
    name='Scatter Plot'
)
layout = Layout(
    title='Scatter Plot',
    xaxis=dict(title='Four Day Rain Total', titlefont=dict(axis_font)),
    yaxis=dict(title='Entero Count', titlefont=dict(axis_font))
)
fig = Figure(data=[trace], layout=layout)
py.iplot(fig, filename='basic-scatter')
There seems to be a strong relationship between rainfall and Entero Count. Once there is more than 2.5 inches of rain, the Entero Count readings appear to be lower.
In [66]:
import seaborn as sns  # re-imported here (already imported at top) so this cell stands alone
# Correlation heatmap for EnteroCount vs. FourDayRainTotal.
corr = relationship.corr()
labels = corr.columns.values
sns.heatmap(corr,
            xticklabels=labels,
            yticklabels=labels)
Out[66]: